
*The following procedure converges but it is a heywood case.
  
*do parallel analysis in each step?

*use entire audit dataset?
*ssc install factortest
*net install polychoric, from(http://staskolenikov.net/stata)
*ssc install subsetbyvif, update
*net install st0249.pkg
*do this just once
*mata: mata mlib index
/*
ssc install grstyle, replace

set scheme s1color
*net install labloadingplot, from(http://www.radyakin.org/stata/labloadingplot/)
grstyle init
*grstyle set color lpattern
*grstyle set ptol, rainbow n(8) reverse*/

clear all
set scheme plotplainblind


mata: mata clear


*USE THE TEST DATA:
use "C:/Users/awcassidy1\Dropbox\jmp_new\cleaned_data/test_data.dta", clear


gen negleakavg_frac=negleakavg/100

set seed 123456

replace negleakavg=negleakavg_frac





cap drop negrrec
cap drop negwinrec
gen negrrec=-rrec
gen negwinrec=-winrec


cap drop notraddr
gen notraddr=0
replace notraddr=1 if rrec==0
rename waterheaterfueltype whftype
rename furnacefueltype fftype
rename waterheatertanktype wh_tanktype
rename ductsystem1type duct_type

cap drop ah_type
cap drop ah_location
rename system1airhandlertype ah_type
rename system1locationairhandler ah_location




local keeplist_finaldatasetvars    ///
	   ah_location  ///	 
	  ah_type  ///
	  progtherm  ///
	rsavg   duct_type  ///
	negsysageavg ///
	sizeavg ///
	  eeravg  ///
	  negleakavg atticrvalue ductravg ///
	   ///
	  whftype  ///
	  fftype wh_tanktype  twosystems notwinrec notraddr
	  
local less_observable_vars  ductravg duct_type negleakavg ///
		rsavg atticrvalue eeravg 
		
local other_less_observable_vars notwinrec ah_type
local more_observable_vars  sizeavg progtherm ///
		fftype whftype negsysageavg  ah_location
local other_more_observable_vars notraddr twosystems 

local less_observable_vars `less_observable_vars' `other_less_observable_vars'
		
local more_observable_vars `more_observable_vars' `other_more_observable_vars'


sum `keeplist_finaldatasetvars'

local all_observable_vars `less_observable_vars' `more_observable_vars' 

local keeplist_finaldatasetvars    ///
	   ah_location  ///	  *these on this line  were not there before ah_type 
	  ah_type  ///
	  progtherm  ///
	rsavg   duct_type  ///
	negsysageavg ///
	sizeavg ///
	  eeravg  ///
	  negleakavg atticrvalue ductravg ///
	   ///
	  whftype  ///
	  fftype wh_tanktype twosystems notwinrec notraddr
	  
egen mean_ee=rowmean(`keeplist_finaldatasetvars')
pwcorr `keeplist_finaldatasetvars' mean_ee

alpha `keeplist_finaldatasetvars', item asis



foreach v in `keeplist_finaldatasetvars' {
	keep if !missing(`v')
}



********************************************************************************
*STANDARDIZE OR SCALE
********************************************************************************

	
foreach v in `all_observable_vars' {
	sum `v', detail
	*stdize
	replace `v'= (`v'-r(mean))/(r(sd))
	la var `v' "`: var label `v'_st'"

	la var `v' "`: var label `v'_st'"


	}	
	
	
la var notwinrec "Not Rec WS"

la var notraddr "Not Rec Add Ins"

la var ah_type "Vertical AH"
la var duct_type "Metal Ducts"	

la var whftype "Gas WH"
la var fftype "Gas Furnace"
la var negsysageavg "- System Age"
la var sizeavg "Size (Sqft/Ton)"
la var progtherm "Programmable Thermostat"
la var twosystems "2 Systems"
la var ah_location "AH in Closet"

alpha `less_observable_vars', item asis 
alpha `more_observable_vars', item asis 
alpha `less_observable_vars' `more_observable_vars', item asis 

alpha `all_observable_vars', item std

factor `all_observable_vars', blanks(.3) factors(3) ml
rotate, oblique quartimin blanks(.3)


local all_observable_vars `less_observable_vars' `more_observable_vars'


egen mean=rowmean(`less_observable_vars' `more_observable_vars')
sum mean
local std=r(sd)
local variance=(`std')^2
di "variance=`variance'"

egen mean_less=rowmean(`less_observable_vars')
sum mean_less
local std=r(sd)
local variance=(`std')^2
di "variance_less=`variance'"

egen mean_more=rowmean(`more_observable_vars')
sum mean_more
local std=r(sd)
local variance=(`std')^2
di "variance_more=`variance'"


egen sum=rowtotal(`less_observable_vars' `more_observable_vars')
sum mean
local std=r(sd)
local variance=(`std')^2
di "variance=`variance'"

egen nm_less=rownonmiss(`less_observable_vars')

egen sum_less=rowtotal(`less_observable_vars') if nm_less==8
sum sum_less
local std=r(sd)
local variance=(`std')^2
di "variance_sum_less=`variance'"

egen nm_more=rownonmiss(`more_observable_vars') 


egen sum_more=rowtotal(`more_observable_vars') if nm_more==8
sum sum_more
local std=r(sd)
local variance=(`std')^2
di "variance_sum_more=`variance'"

local start_cut_less=3
local start_cut_more=3
local start_cut_all=4

local components_less=1
local components_more=2
local components_all=3


local reps=500
local folds=10

local max_cut=17

local reps_for_stab=200

local facname_less="Less"
local facname_more="More"
local facname_all="All"

local percent_less=30
local percent_more=40
local percent_all=50

*set seed 123456
*less more
local descriptorlist "less more"
foreach descriptor in `descriptorlist' {
	local start_cut=`start_cut_`descriptor''
	local percent=`percent_`descriptor''
	local c=`components_`descriptor''
	
	*problematically, these figs disrupt the sort order and store stuff in mata.
	set seed 123456
	sort _all, stable
	mata: mata clear
	set seed 123456
	tt ``descriptor'_observable_vars',  cut(`max_cut') 
	
	set seed 123456
	sort _all, stable
	mata: mata clear
	set seed 123456
	ttscree, neigen(8) title("")
	graph export "C:/Users/awcassidy1\Dropbox\jmp_new\figs/scree_`descriptor'_mc`max_cut'.pdf", replace as(pdf)

	set seed 123456
	sort _all, stable
	mata: mata clear
	set seed 123456
	tt ``descriptor'_observable_vars',  cut(`start_cut') components(`c') 
	
		
		
	*change problematic labels
	la var negleakavg "- % Duct Leakage"
	
	*get the groups
	*the first variable is in position 0
	*then we need to get the other ones ignoring 0s and 9s.
	
	local labellist=" 1 "
	*the first variable in the list goes first

	

	

	*the tt dendro with labels code needs a distmat and wordlist which is the vars
	*that went into the tt command.
	*it also needs a custom title.
	local wordlist="``descriptor'_observable_vars'"
	mat distmat=e(distmat)
	local title=""
	include "C:/Users/awcassidy1\Dropbox\jmp_new\programs/code_to_call/ttdendro_with_labels"

 

	graph export "C:/Users/awcassidy1\Dropbox\jmp_new\figs/dendro_`descriptor'_withlabels.pdf", replace as(pdf)

	*can double check that labels worked:
	ttdendro
	graph export "C:/Users/awcassidy1\Dropbox\jmp_new\figs/dendro_`descriptor'_nolabels.pdf", replace as(pdf)

	set seed 123456
	sort _all, stable
	mata: mata clear
	set seed 123456
	tt ``descriptor'_observable_vars',  cut(`start_cut') components(`c')

	set seed 123456
	sort _all, stable
	mata: mata clear
	set seed 123456
	ttscree, neigen(8)
	graph export "C:/Users/awcassidy1\Dropbox\jmp_new\figs/scree_`descriptor'_sc`start_cut'.pdf", replace as(pdf)

	ereturn list
	
	*start with max cut.
	set seed 123456
	sort _all, stable
	mata: mata clear
	set seed 123456
	tt ``descriptor'_observable_vars',  cut(`max_cut')  components(`c')
	ttcv  ``descriptor'_observable_vars', components(`c') reps(`reps') folds(`folds') percent(`percent')
	local optimal_max=e(cvknee)
	graph export "C:/Users/awcassidy1\Dropbox\jmp_new\figs/cross_validation_`descriptor'_sc`start_cut'_r`reps'_f`folds'_mc`max_cut'om`optimal_max'.pdf", replace as(pdf)
	
	*next try start cut.	
	set seed 123456
	sort _all, stable
	mata: mata clear
	set seed 123456
	tt ``descriptor'_observable_vars',  cut(`start_cut')  components(`c')
	ttcv  ``descriptor'_observable_vars', components(`c') reps(`reps') folds(`folds') percent(`percent')
	local optimal=e(cvknee)
	graph export "C:/Users/awcassidy1\Dropbox\jmp_new\figs/cross_validation_`descriptor'_sc`start_cut'_r`reps'_f`folds'.pdf", replace as(pdf)
	
	
	*next try starting with the optimal cut
	set seed 123456
	sort _all, stable
	mata: mata clear
	set seed 123456
	tt ``descriptor'_observable_vars',  cut(`optimal') components(`c')
	ttcv  ``descriptor'_observable_vars', components(`c') reps(`reps') folds(`folds') percent(`percent')
	local optimal=e(cvknee)
	graph export "C:/Users/awcassidy1\Dropbox\jmp_new\figs/cross_validation_`descriptor'_sc`start_cut'_r`reps'_f`folds'_mc`max_cut'o`optimal'.pdf", replace as(pdf)
	
	*export cross validation for paper
	set seed 123456
	sort _all, stable
	mata: mata clear
	set seed 123456
	tt ``descriptor'_observable_vars',  cut(`optimal') components(`c')
	ttcv  ``descriptor'_observable_vars', components(`c') reps(`reps') folds(`folds') percent(`percent') title("")
	local optimal=e(cvknee)
	graph export "C:/Users/awcassidy1\Dropbox\jmp_new\figs/cross_validation_`descriptor'_r`reps'_f`folds'o`optimal'.pdf", replace as(pdf)
	
	
	*export scree for paper
	set seed 123456
	sort _all, stable
	mata: mata clear
	set seed 123456
	tt ``descriptor'_observable_vars',  cut(`optimal') components(`c')
	ttscree, neigen(8) title("")
	graph export "C:/Users/awcassidy1\Dropbox\jmp_new\figs/scree_`descriptor'_o`optimal'.pdf", replace as(pdf)
	
	*now, does optimal give us back optimal?
	set seed 123456
	sort _all, stable
	mata: mata clear
	set seed 123456
	tt ``descriptor'_observable_vars',  cut(`optimal') components(`c')
	ttcv  ``descriptor'_observable_vars', components(`c') reps(`reps') folds(`folds') percent(`percent')
	local re_optimal=e(cvknee)
	graph export "C:/Users/awcassidy1\Dropbox\jmp_new\figs/cross_validation_`descriptor'_sc`start_cut'_r`reps'_f`folds'_mc`max_cut'o`optimal_max'_r`re_optimal'.pdf", replace as(pdf)
	
	di "optimal=`optimal'"
	di "re_optimal=`re_optimal'"
	
	local optimal=`re_optimal'
	
	set seed 123456
	sort _all, stable
	mata: mata clear
	set seed 123456
	tt ``descriptor'_observable_vars',  cut(`optimal')  components(`c')
	*get list of variables in first component

	local `descriptor'_observable_extracted=""
	local semphrase_`descriptor'=""
	forval componentnum=1/`components_`descriptor'' {
		local `descriptor'_observable_extracted_`componentnum'=""
		local k=1
		foreach v in ``descriptor'_observable_vars' {
			local loading=e(basis)[`k',`componentnum']
			di "loading=`loading'"
			if `loading'!=0 {
				local `descriptor'_observable_extracted_`componentnum'= "``descriptor'_observable_extracted_`componentnum'' `v'"
				}
			local k=`k'+1
			}
		di "`descriptor'_observable_extracted_`componentnum'=``descriptor'_observable_extracted_`componentnum''"
		local `descriptor'_observable_extracted= "``descriptor'_observable_extracted' ``descriptor'_observable_extracted_`componentnum''"
		di "`descriptor'_observable_extracted= ``descriptor'_observable_extracted'"

		di "optimal=`optimal'"
		
			}
			
		di "`descriptor'_observable_extracted=``descriptor'_observable_extracted'"
		di "semphrase_`descriptor'=`semphrase_`descriptor''"

	****************************************************************************
	*get semphrase putting all the variables on one factor.
	****************************************************************************

	local words_in_`descriptor'=`: word count ``descriptor'_observable_extracted''
	
	if `words_in_`descriptor'' > 2 & `words_in_`descriptor''<20 {
		local word1="`: word 1 of ``descriptor'_observable_extracted''"
		local otherwords=subinstr("``descriptor'_observable_extracted'","`word1'","",1)
		local semphrase_`descriptor'="(`word1'<- `facname_`descriptor''@1)(`otherwords' <- `facname_`descriptor'')"
		}
	if `words_in_`descriptor'' == 2 {
		local word1="`: word 1 of ``descriptor'_observable_extracted''"
		local word2="`: word 2 of ``descriptor'_observable_extracted''"

		local semphrase_`descriptor'="(`word1' <- `facname_`descriptor''@a) (`word2' <- `facname_`descriptor''@a)"
		}	
	if `words_in_`descriptor''== 1 {
		di "ERROR ONLY 1 in FACTOR!"
		stop
		}
		
	set seed 123456
	sort _all, stable
	mata: mata clear
	set seed 123456
	ttstab, reps(`reps_for_stab') keep(10)

	
	}
	
save "C:/Users/awcassidy1\Dropbox\jmp_new\cleaned_data/temp.dta", replace
